import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
%matplotlib inline
#html export
import plotly.io as pio
pio.renderers.default = 'notebook'

Dataset creation¶
hiv_df = pd.read_csv(r'HIV data 2000-2023.csv', encoding='ISO-8859-1')
hiv_df.head()
| IndicatorCode | Indicator | ValueType | ParentLocationCode | ParentLocation | Location type | SpatialDimValueCode | Location | Period type | Period | Value | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | HIV_0000000001 | Estimated number of people (all ages) living w... | numeric | AFR | Africa | Country | AGO | Angola | Year | 2023 | 320 000 [280 000 - 380 000] |
| 1 | HIV_0000000001 | Estimated number of people (all ages) living w... | numeric | AFR | Africa | Country | AGO | Angola | Year | 2022 | 320 000 [280 000 - 380 000] |
| 2 | HIV_0000000001 | Estimated number of people (all ages) living w... | numeric | AFR | Africa | Country | AGO | Angola | Year | 2021 | 320 000 [280 000 - 380 000] |
| 3 | HIV_0000000001 | Estimated number of people (all ages) living w... | numeric | AFR | Africa | Country | AGO | Angola | Year | 2020 | 320 000 [280 000 - 370 000] |
| 4 | HIV_0000000001 | Estimated number of people (all ages) living w... | numeric | AFR | Africa | Country | AGO | Angola | Year | 2015 | 300 000 [260 000 - 350 000] |
Attributes¶
IndicatorCode: A unique identifier for the indicator being measured (e.g.,
"HIV_0000000001"for the estimated number of people living with HIV).Indicator: A description of the indicator being measured (e.g.,
"Estimated number of people (all ages) living with HIV").ValueType: Specifies the type of data recorded (e.g.,
"numeric"for numerical values).ParentLocationCode: A code representing the broader geographical region to which the location belongs (e.g.,
"AFR"for Africa).ParentLocation: The name of the broader geographical region (e.g.,
"Africa").Location type: Describes the type of location (e.g.,
"Country").SpatialDimValueCode: A unique code for the specific location (e.g.,
"AGO"for Angola).Location: The name of the specific location (e.g.,
"Angola").Period type: Specifies the type of time period (e.g.,
"Year").Period: The year for which the data is recorded (e.g.,
"2023").Value: The estimated number of people living with HIV, often including a range (e.g.,
"320 000 [280 000 - 380 000]"for Angola in 2023).
This dataset provides detailed information on HIV prevalence across various countries and regions, with data spanning multiple years (2000–2023). The Value field is particularly important as it contains the estimated figures, often accompanied by confidence intervals. The dataset is structured to allow analysis by region, country, and year.
Since the dataset focuses on an estimated number of people living with HIV, the columns IndicatorCode, Indicator, ValueType, Location type and Period type are redundant and do not add analytical value to trend analysis hence we drop them
# deleting redundant columns that do not add analytic value
del hiv_df['IndicatorCode']
del hiv_df['Indicator']
del hiv_df['ValueType']
del hiv_df['Location type']
del hiv_df['Period type']
hiv_df
| ParentLocationCode | ParentLocation | SpatialDimValueCode | Location | Period | Value | |
|---|---|---|---|---|---|---|
| 0 | AFR | Africa | AGO | Angola | 2023 | 320 000 [280 000 - 380 000] |
| 1 | AFR | Africa | AGO | Angola | 2022 | 320 000 [280 000 - 380 000] |
| 2 | AFR | Africa | AGO | Angola | 2021 | 320 000 [280 000 - 380 000] |
| 3 | AFR | Africa | AGO | Angola | 2020 | 320 000 [280 000 - 370 000] |
| 4 | AFR | Africa | AGO | Angola | 2015 | 300 000 [260 000 - 350 000] |
| ... | ... | ... | ... | ... | ... | ... |
| 1547 | WPR | Western Pacific | WSM | Samoa | 2020 | No data |
| 1548 | WPR | Western Pacific | WSM | Samoa | 2015 | No data |
| 1549 | WPR | Western Pacific | WSM | Samoa | 2010 | No data |
| 1550 | WPR | Western Pacific | WSM | Samoa | 2005 | No data |
| 1551 | WPR | Western Pacific | WSM | Samoa | 2000 | No data |
1552 rows × 6 columns
hiv_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1552 entries, 0 to 1551 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ParentLocationCode 1552 non-null object 1 ParentLocation 1552 non-null object 2 SpatialDimValueCode 1552 non-null object 3 Location 1552 non-null object 4 Period 1552 non-null int64 5 Value 1552 non-null object dtypes: int64(1), object(5) memory usage: 72.9+ KB
Cleaning¶
# duplicates
hiv_df.duplicated().sum()
0
# nulls
hiv_df.isna().sum()
ParentLocationCode 0 ParentLocation 0 SpatialDimValueCode 0 Location 0 Period 0 Value 0 dtype: int64
There are no duplicates nor null values. However, some entries of the Value field have "No data" or placeholder values like "<200" for very small estimates so we need to clean it to have single values
# entries with the value "No data"
no_data = hiv_df[hiv_df['Value'] == 'No data']
no_data
| ParentLocationCode | ParentLocation | SpatialDimValueCode | Location | Period | Value | |
|---|---|---|---|---|---|---|
| 40 | AFR | Africa | CAF | Central African Republic | 2023 | No data |
| 41 | AFR | Africa | CAF | Central African Republic | 2022 | No data |
| 42 | AFR | Africa | CAF | Central African Republic | 2021 | No data |
| 43 | AFR | Africa | CAF | Central African Republic | 2020 | No data |
| 44 | AFR | Africa | CAF | Central African Republic | 2015 | No data |
| ... | ... | ... | ... | ... | ... | ... |
| 1547 | WPR | Western Pacific | WSM | Samoa | 2020 | No data |
| 1548 | WPR | Western Pacific | WSM | Samoa | 2015 | No data |
| 1549 | WPR | Western Pacific | WSM | Samoa | 2010 | No data |
| 1550 | WPR | Western Pacific | WSM | Samoa | 2005 | No data |
| 1551 | WPR | Western Pacific | WSM | Samoa | 2000 | No data |
394 rows × 6 columns
# countries with No data on people living with HIV
no_data['Location'].unique()
array(['Central African Republic', 'Cameroon', 'Equatorial Guinea',
'Sao Tome and Principe', 'Seychelles', 'Antigua and Barbuda',
'Canada', 'Dominica', 'Grenada', 'Saint Kitts and Nevis',
'Saint Lucia', 'Trinidad and Tobago', 'United States of America',
'Saint Vincent and the Grenadines', 'Bahrain', 'Andorra',
'Austria', 'Belgium', 'Cyprus', 'Germany', 'Finland',
'United Kingdom of Great Britain and Northern Ireland', 'Hungary',
'Monaco', 'Netherlands (Kingdom of the)', 'Norway', 'Poland',
'Russian Federation', 'San Marino', 'Sweden', 'Turkmenistan',
'T\x9frkiye', 'Ukraine', 'Uzbekistan', 'India', 'Maldives',
"Democratic People's Republic of Korea", 'Brunei Darussalam',
'China', 'Cook Islands', 'Micronesia (Federated States of)',
'Japan', 'Kiribati', 'Republic of Korea', 'Marshall Islands',
'Niue', 'Nauru', 'Palau', 'Solomon Islands', 'Tonga', 'Tuvalu',
'Vanuatu', 'Samoa'], dtype=object)
# cleaning our dataset entries with the value "No data"
hiv_df = hiv_df[hiv_df['Value'] != 'No data']
hiv_df
| ParentLocationCode | ParentLocation | SpatialDimValueCode | Location | Period | Value | |
|---|---|---|---|---|---|---|
| 0 | AFR | Africa | AGO | Angola | 2023 | 320 000 [280 000 - 380 000] |
| 1 | AFR | Africa | AGO | Angola | 2022 | 320 000 [280 000 - 380 000] |
| 2 | AFR | Africa | AGO | Angola | 2021 | 320 000 [280 000 - 380 000] |
| 3 | AFR | Africa | AGO | Angola | 2020 | 320 000 [280 000 - 370 000] |
| 4 | AFR | Africa | AGO | Angola | 2015 | 300 000 [260 000 - 350 000] |
| ... | ... | ... | ... | ... | ... | ... |
| 1531 | WPR | Western Pacific | VNM | Viet Nam | 2020 | 250 000 [230 000 - 270 000] |
| 1532 | WPR | Western Pacific | VNM | Viet Nam | 2015 | 240 000 [210 000 - 260 000] |
| 1533 | WPR | Western Pacific | VNM | Viet Nam | 2010 | 210 000 [190 000 - 230 000] |
| 1534 | WPR | Western Pacific | VNM | Viet Nam | 2005 | 180 000 [150 000 - 200 000] |
| 1535 | WPR | Western Pacific | VNM | Viet Nam | 2000 | 120 000 [100 000 - 140 000] |
1158 rows × 6 columns
We have dropped 394 columns with 'No Data' as their value for people living with HIV
# extract the central estimate (320000) as a number for values with the format: 320 000 [280 000 - 380 000]
def extract_value(val):
if isinstance(val, str):
# Handle values like "<500" at the beginning
if val.startswith('<'):
number = int(val[1:].split()[0]) # take just the number part before any space
return number - 1 # assume just under that number
else:
# Extract the number at the start if it's not a "<" value
match = pd.Series(val).str.extract(r'^([\d\s]+)').iloc[0, 0]
if match:
return float(match.replace(' ', '')) # remove spaces, convert to float
return None # fallback if no match
# Apply to value column and convert to int
hiv_df.loc[:, 'Value'] = hiv_df['Value'].apply(extract_value)
hiv_df['Value'] = hiv_df['Value'].astype(int)
hiv_df
C:\Users\deninjo\AppData\Local\Temp\ipykernel_18780\1950752774.py:18: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
| ParentLocationCode | ParentLocation | SpatialDimValueCode | Location | Period | Value | |
|---|---|---|---|---|---|---|
| 0 | AFR | Africa | AGO | Angola | 2023 | 320000 |
| 1 | AFR | Africa | AGO | Angola | 2022 | 320000 |
| 2 | AFR | Africa | AGO | Angola | 2021 | 320000 |
| 3 | AFR | Africa | AGO | Angola | 2020 | 320000 |
| 4 | AFR | Africa | AGO | Angola | 2015 | 300000 |
| ... | ... | ... | ... | ... | ... | ... |
| 1531 | WPR | Western Pacific | VNM | Viet Nam | 2020 | 250000 |
| 1532 | WPR | Western Pacific | VNM | Viet Nam | 2015 | 240000 |
| 1533 | WPR | Western Pacific | VNM | Viet Nam | 2010 | 210000 |
| 1534 | WPR | Western Pacific | VNM | Viet Nam | 2005 | 180000 |
| 1535 | WPR | Western Pacific | VNM | Viet Nam | 2000 | 120000 |
1158 rows × 6 columns
hiv_df.info()
<class 'pandas.core.frame.DataFrame'> Index: 1158 entries, 0 to 1535 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ParentLocationCode 1158 non-null object 1 ParentLocation 1158 non-null object 2 SpatialDimValueCode 1158 non-null object 3 Location 1158 non-null object 4 Period 1158 non-null int64 5 Value 1158 non-null int32 dtypes: int32(1), int64(1), object(4) memory usage: 58.8+ KB

Question One¶
Create a visualization that shows the trend of HIV cases in the countries that contribute to 75% of the global burden¶
We'll first group the cleaned dataset by Location and Period, summing up Value.
# 1. Group by year and country to get total HIV cases per country per year
country_year_df = hiv_df.groupby(['Location', 'Period'])['Value'].sum().reset_index()
country_year_df
| Location | Period | Value | |
|---|---|---|---|
| 0 | Afghanistan | 2000 | 1600 |
| 1 | Afghanistan | 2005 | 2800 |
| 2 | Afghanistan | 2010 | 4100 |
| 3 | Afghanistan | 2015 | 6500 |
| 4 | Afghanistan | 2020 | 10000 |
| ... | ... | ... | ... |
| 1153 | Zimbabwe | 2015 | 1400000 |
| 1154 | Zimbabwe | 2020 | 1300000 |
| 1155 | Zimbabwe | 2021 | 1300000 |
| 1156 | Zimbabwe | 2022 | 1300000 |
| 1157 | Zimbabwe | 2023 | 1300000 |
1158 rows × 3 columns
We then calculate Total Global HIV Cases Per Year. This will eventually help us in determining each country’s contribution to the global burden for each year.
# 2. Total global cases per year
global_yearly_total = country_year_df.groupby('Period')['Value'].sum().reset_index()
global_yearly_total.rename(columns={'Value': 'GlobalTotal'}, inplace=True)
global_yearly_total
| Period | GlobalTotal | |
|---|---|---|
| 0 | 2000 | 21353916 |
| 1 | 2005 | 23492604 |
| 2 | 2010 | 26058607 |
| 3 | 2015 | 28925842 |
| 4 | 2020 | 31016146 |
| 5 | 2021 | 31314227 |
| 6 | 2022 | 31198307 |
| 7 | 2023 | 33933987 |
Merge global total and find country percentage
For each year, we already know the total number of HIV cases globally (
GlobalTotal).Now, we add that total next to each country’s number for that year.
Then, we calculate what percentage each country contributes that year:
Example: If Kenya had 100,000 cases and the world had 1,000,000 cases that year → Kenya’s share = 10%.
# 3. Merge global total back to country-level data
merged_df = country_year_df.merge(global_yearly_total, on='Period')
merged_df['Percent'] = merged_df['Value'] / merged_df['GlobalTotal'] * 100
merged_df
| Location | Period | Value | GlobalTotal | Percent | |
|---|---|---|---|---|---|
| 0 | Afghanistan | 2000 | 1600 | 21353916 | 0.007493 |
| 1 | Afghanistan | 2005 | 2800 | 23492604 | 0.011919 |
| 2 | Afghanistan | 2010 | 4100 | 26058607 | 0.015734 |
| 3 | Afghanistan | 2015 | 6500 | 28925842 | 0.022471 |
| 4 | Afghanistan | 2020 | 10000 | 31016146 | 0.032241 |
| ... | ... | ... | ... | ... | ... |
| 1153 | Zimbabwe | 2015 | 1400000 | 28925842 | 4.839963 |
| 1154 | Zimbabwe | 2020 | 1300000 | 31016146 | 4.191365 |
| 1155 | Zimbabwe | 2021 | 1300000 | 31314227 | 4.151468 |
| 1156 | Zimbabwe | 2022 | 1300000 | 31198307 | 4.166893 |
| 1157 | Zimbabwe | 2023 | 1300000 | 33933987 | 3.830967 |
1158 rows × 5 columns
Instead of picking the biggest contributors only based on today (latest year) or on average across years, we are now looking at the total number of cases a country has contributed from 2000 to 2023.
This accumulation over time is the true burden we are measuring, it telling us which countries have been the biggest contributors overall — not just recently, and favors countries that had high numbers consistently across many years, not just a sudden rise or fall.
# 4. Sum total cases per country across all years
country_total = merged_df.groupby('Location')['Value'].sum().reset_index()
global_total_sum = global_yearly_total['GlobalTotal'].sum()
country_total
| Location | Value | |
|---|---|---|
| 0 | Afghanistan | 61000 |
| 1 | Albania | 8208 |
| 2 | Algeria | 126100 |
| 3 | Angola | 2160000 |
| 4 | Argentina | 859000 |
| ... | ... | ... |
| 142 | Venezuela (Bolivarian Republic of) | 724000 |
| 143 | Viet Nam | 1750000 |
| 144 | Yemen | 78100 |
| 145 | Zambia | 8890000 |
| 146 | Zimbabwe | 10900000 |
147 rows × 2 columns
We then find the percentage each country contributed to this global total
# 5. Calculate percent contribution of each country
country_total['Percent'] = country_total['Value'] / global_total_sum * 100
country_total
| Location | Value | Percent | |
|---|---|---|---|
| 0 | Afghanistan | 61000 | 0.026838 |
| 1 | Albania | 8208 | 0.003611 |
| 2 | Algeria | 126100 | 0.055479 |
| 3 | Angola | 2160000 | 0.950313 |
| 4 | Argentina | 859000 | 0.377925 |
| ... | ... | ... | ... |
| 142 | Venezuela (Bolivarian Republic of) | 724000 | 0.318531 |
| 143 | Viet Nam | 1750000 | 0.769929 |
| 144 | Yemen | 78100 | 0.034361 |
| 145 | Zambia | 8890000 | 3.911240 |
| 146 | Zimbabwe | 10900000 | 4.795559 |
147 rows × 3 columns
Filter Top Contributors Covering 75% of Global Burden by sorting countries by their contribution and cumulatively sum their % share until reaching 75%.
# 6. Sort and compute cumulative percent
country_total = country_total.sort_values('Percent', ascending=False)
country_total['CumulativePercent'] = country_total['Percent'].cumsum()
# 7. Select countries contributing to 75% of total global cases
top_countries = country_total[country_total['CumulativePercent'] <= 75]['Location'].tolist()
top_countries
['South Africa', 'Mozambique', 'Nigeria', 'Kenya', 'United Republic of Tanzania', 'Zimbabwe', 'Uganda', 'Zambia', 'Malawi', 'Brazil', 'Thailand', 'Ethiopia', 'Democratic Republic of the Congo', "Cote d'Ivoire"]
# 8. Filter original country-year data for those top countries
top_country_trend = country_year_df[country_year_df['Location'].isin(top_countries)]
# 9. Plot the trend
fig = px.line(top_country_trend,
x='Period',
y='Value',
color='Location',
title='Trend of HIV Cases in Top Contributing Countries (75% of Cumulative Global Cases)',
labels={'Value': 'Estimated HIV Cases', 'Period': 'Year'})
fig.show()
Generate a visualization that displays the trend of HIV cases in the countries contributing to 75% of the burden within each WHO region (column called ParentLocationCode contains the WHO regions)¶
Instead of asking "Who are the biggest players globally?" we are now trying to find out, "Who are the biggest players inside each WHO region?".
First and foremost, for each WHO region, country, and year, we sum the HIV cases.
# 1. Group by year, country, and WHO region to get total HIV cases per country per year
country_year_df = hiv_df.groupby(['ParentLocationCode', 'Location', 'Period'])['Value'].sum().reset_index()
country_year_df
| ParentLocationCode | Location | Period | Value | |
|---|---|---|---|---|
| 0 | AFR | Algeria | 2000 | 2400 |
| 1 | AFR | Algeria | 2005 | 5700 |
| 2 | AFR | Algeria | 2010 | 10000 |
| 3 | AFR | Algeria | 2015 | 15000 |
| 4 | AFR | Algeria | 2020 | 21000 |
| ... | ... | ... | ... | ... |
| 1153 | WPR | Viet Nam | 2015 | 240000 |
| 1154 | WPR | Viet Nam | 2020 | 250000 |
| 1155 | WPR | Viet Nam | 2021 | 250000 |
| 1156 | WPR | Viet Nam | 2022 | 250000 |
| 1157 | WPR | Viet Nam | 2023 | 250000 |
1158 rows × 4 columns
Now, for each WHO region and year, we find the total HIV cases (summing all countries in the region) so that we can find out each country's share within its region later on
# 2. Total regional cases per year
regional_yearly_total = country_year_df.groupby(['ParentLocationCode', 'Period'])['Value'].sum().reset_index()
regional_yearly_total.rename(columns={'Value': 'RegionalTotal'}, inplace=True)
regional_yearly_total
| ParentLocationCode | Period | RegionalTotal | |
|---|---|---|---|
| 0 | AFR | 2000 | 18088999 |
| 1 | AFR | 2005 | 19435099 |
| 2 | AFR | 2010 | 21219799 |
| 3 | AFR | 2015 | 23436199 |
| 4 | AFR | 2020 | 24838099 |
| 5 | AFR | 2021 | 25025199 |
| 6 | AFR | 2022 | 25102299 |
| 7 | AFR | 2023 | 25199599 |
| 8 | AMR | 2000 | 1152900 |
| 9 | AMR | 2005 | 1459800 |
| 10 | AMR | 2010 | 1773900 |
| 11 | AMR | 2015 | 2091600 |
| 12 | AMR | 2020 | 2455300 |
| 13 | AMR | 2021 | 2480700 |
| 14 | AMR | 2022 | 2558000 |
| 15 | AMR | 2023 | 2631200 |
| 16 | EMR | 2000 | 98893 |
| 17 | EMR | 2005 | 158135 |
| 18 | EMR | 2010 | 243666 |
| 19 | EMR | 2015 | 313167 |
| 20 | EMR | 2020 | 422730 |
| 21 | EMR | 2021 | 452150 |
| 22 | EMR | 2022 | 486170 |
| 23 | EMR | 2023 | 529280 |
| 24 | EUR | 2000 | 577537 |
| 25 | EUR | 2005 | 746342 |
| 26 | EUR | 2010 | 937863 |
| 27 | EUR | 2015 | 1057376 |
| 28 | EUR | 2020 | 1165097 |
| 29 | EUR | 2021 | 1184638 |
| 30 | EUR | 2022 | 864688 |
| 31 | EUR | 2023 | 849238 |
| 32 | SEAR | 2000 | 1147589 |
| 33 | SEAR | 2005 | 1312330 |
| 34 | SEAR | 2010 | 1427320 |
| 35 | SEAR | 2015 | 1478780 |
| 36 | SEAR | 2020 | 1481300 |
| 37 | SEAR | 2021 | 1491400 |
| 38 | SEAR | 2022 | 1482600 |
| 39 | SEAR | 2023 | 3982700 |
| 40 | WPR | 2000 | 287998 |
| 41 | WPR | 2005 | 380898 |
| 42 | WPR | 2010 | 456059 |
| 43 | WPR | 2015 | 548720 |
| 44 | WPR | 2020 | 653620 |
| 45 | WPR | 2021 | 680140 |
| 46 | WPR | 2022 | 704550 |
| 47 | WPR | 2023 | 741970 |
We then find the percentage each country contributed, each year inside its region.
# 3. Merge regional total back to country-level data
merged_df = country_year_df.merge(regional_yearly_total, on=['ParentLocationCode', 'Period'])
merged_df['Percent'] = merged_df['Value'] / merged_df['RegionalTotal'] * 100
merged_df
| ParentLocationCode | Location | Period | Value | RegionalTotal | Percent | |
|---|---|---|---|---|---|---|
| 0 | AFR | Algeria | 2000 | 2400 | 18088999 | 0.013268 |
| 1 | AFR | Algeria | 2005 | 5700 | 19435099 | 0.029328 |
| 2 | AFR | Algeria | 2010 | 10000 | 21219799 | 0.047126 |
| 3 | AFR | Algeria | 2015 | 15000 | 23436199 | 0.064004 |
| 4 | AFR | Algeria | 2020 | 21000 | 24838099 | 0.084548 |
| ... | ... | ... | ... | ... | ... | ... |
| 1153 | WPR | Viet Nam | 2015 | 240000 | 548720 | 43.738154 |
| 1154 | WPR | Viet Nam | 2020 | 250000 | 653620 | 38.248524 |
| 1155 | WPR | Viet Nam | 2021 | 250000 | 680140 | 36.757138 |
| 1156 | WPR | Viet Nam | 2022 | 250000 | 704550 | 35.483642 |
| 1157 | WPR | Viet Nam | 2023 | 250000 | 741970 | 33.694085 |
1158 rows × 6 columns
For each country and region, sum all HIV cases across all years (2000–2023). This tells US how big a contributor the country is overall, not just in a year.
# 4. Sum Cases Across All Years
total_contribution = merged_df.groupby(['ParentLocationCode', 'Location'])['Value'].sum().reset_index()
total_contribution
| ParentLocationCode | Location | Value | |
|---|---|---|---|
| 0 | AFR | Algeria | 126100 |
| 1 | AFR | Angola | 2160000 |
| 2 | AFR | Benin | 542000 |
| 3 | AFR | Botswana | 2680000 |
| 4 | AFR | Burkina Faso | 901000 |
| ... | ... | ... | ... |
| 142 | WPR | New Zealand | 22600 |
| 143 | WPR | Papua New Guinea | 392000 |
| 144 | WPR | Philippines | 686400 |
| 145 | WPR | Singapore | 45000 |
| 146 | WPR | Viet Nam | 1750000 |
147 rows × 3 columns
We then calculate the Total Sum per Region
# 5. Get total sum per region for percentage calculation
regional_total_sum = total_contribution.groupby('ParentLocationCode')['Value'].sum().reset_index()
regional_total_sum.rename(columns={'Value': 'RegionalTotalSum'}, inplace=True)
regional_total_sum
| ParentLocationCode | RegionalTotalSum | |
|---|---|---|
| 0 | AFR | 182345292 |
| 1 | AMR | 16603400 |
| 2 | EMR | 2704191 |
| 3 | EUR | 7382779 |
| 4 | SEAR | 13804019 |
| 5 | WPR | 4453955 |
We merge the total region sums into country-level sums then calculate what % each country contributed overall to its WHO region (across all years).
# 6. Merge to calculate each country's % contribution within its region
total_contribution = total_contribution.merge(regional_total_sum, on='ParentLocationCode')
total_contribution['Percent'] = total_contribution['Value'] / total_contribution['RegionalTotalSum'] * 100
total_contribution
| ParentLocationCode | Location | Value | RegionalTotalSum | Percent | |
|---|---|---|---|---|---|
| 0 | AFR | Algeria | 126100 | 182345292 | 0.069155 |
| 1 | AFR | Angola | 2160000 | 182345292 | 1.184566 |
| 2 | AFR | Benin | 542000 | 182345292 | 0.297238 |
| 3 | AFR | Botswana | 2680000 | 182345292 | 1.469739 |
| 4 | AFR | Burkina Faso | 901000 | 182345292 | 0.494118 |
| ... | ... | ... | ... | ... | ... |
| 142 | WPR | New Zealand | 22600 | 4453955 | 0.507414 |
| 143 | WPR | Papua New Guinea | 392000 | 4453955 | 8.801167 |
| 144 | WPR | Philippines | 686400 | 4453955 | 15.411022 |
| 145 | WPR | Singapore | 45000 | 4453955 | 1.010338 |
| 146 | WPR | Viet Nam | 1750000 | 4453955 | 39.290922 |
147 rows × 5 columns
Sort countries inside each WHO region from highest contributor to lowest, then cumulatively add up percentages within each region.
# 7. Sort and get cumulative percentage
total_contribution = total_contribution.sort_values(['ParentLocationCode', 'Percent'], ascending=[True, False])
total_contribution['CumulativePercent'] = total_contribution.groupby('ParentLocationCode')['Percent'].cumsum()
total_contribution
| ParentLocationCode | Location | Value | RegionalTotalSum | Percent | CumulativePercent | |
|---|---|---|---|---|---|---|
| 35 | AFR | South Africa | 51400000 | 182345292 | 28.188279 | 28.188279 |
| 28 | AFR | Mozambique | 15150000 | 182345292 | 8.308413 | 36.496692 |
| 31 | AFR | Nigeria | 15000000 | 182345292 | 8.226152 | 44.722844 |
| 20 | AFR | Kenya | 12000000 | 182345292 | 6.580921 | 51.303765 |
| 39 | AFR | United Republic of Tanzania | 11600000 | 182345292 | 6.361557 | 57.665322 |
| ... | ... | ... | ... | ... | ... | ... |
| 139 | WPR | Lao People's Democratic Republic | 104400 | 4453955 | 2.343984 | 98.133008 |
| 145 | WPR | Singapore | 45000 | 4453955 | 1.010338 | 99.143346 |
| 142 | WPR | New Zealand | 22600 | 4453955 | 0.507414 | 99.650760 |
| 138 | WPR | Fiji | 11358 | 4453955 | 0.255009 | 99.905769 |
| 141 | WPR | Mongolia | 4197 | 4453955 | 0.094231 | 100.000000 |
147 rows × 6 columns
Keep only the top countries whose cumulative contribution is <= 75% in each WHO region.
# 8. Select countries contributing to 75% within each region
top_countries_per_region = (
total_contribution[total_contribution['CumulativePercent'] <= 75]
)
top_countries_per_region
| ParentLocationCode | Location | Value | RegionalTotalSum | Percent | CumulativePercent | |
|---|---|---|---|---|---|---|
| 35 | AFR | South Africa | 51400000 | 182345292 | 28.188279 | 28.188279 |
| 28 | AFR | Mozambique | 15150000 | 182345292 | 8.308413 | 36.496692 |
| 31 | AFR | Nigeria | 15000000 | 182345292 | 8.226152 | 44.722844 |
| 20 | AFR | Kenya | 12000000 | 182345292 | 6.580921 | 51.303765 |
| 39 | AFR | United Republic of Tanzania | 11600000 | 182345292 | 6.361557 | 57.665322 |
| 41 | AFR | Zimbabwe | 10900000 | 182345292 | 5.977670 | 63.642992 |
| 38 | AFR | Uganda | 10500000 | 182345292 | 5.758306 | 69.401298 |
| 40 | AFR | Zambia | 8890000 | 182345292 | 4.875366 | 74.276664 |
| 47 | AMR | Brazil | 6070000 | 16603400 | 36.558777 | 36.558777 |
| 61 | AMR | Mexico | 2190000 | 16603400 | 13.190070 | 49.748847 |
| 50 | AMR | Colombia | 1430000 | 16603400 | 8.612694 | 58.361540 |
| 58 | AMR | Haiti | 1010000 | 16603400 | 6.083091 | 64.444632 |
| 42 | AMR | Argentina | 859000 | 16603400 | 5.173639 | 69.618271 |
| 68 | AMR | Venezuela (Bolivarian Republic of) | 724000 | 16603400 | 4.360553 | 73.978824 |
| 80 | EMR | Pakistan | 1247600 | 2704191 | 46.135794 | 46.135794 |
| 72 | EMR | Iran (Islamic Republic of) | 327000 | 2704191 | 12.092341 | 58.228136 |
| 84 | EMR | Sudan | 273000 | 2704191 | 10.095441 | 68.323576 |
| 71 | EMR | Egypt | 154400 | 2704191 | 5.709656 | 74.033232 |
| 125 | EUR | Ukraine | 1430000 | 7382779 | 19.369400 | 19.369400 |
| 99 | EUR | France | 1285000 | 7382779 | 17.405370 | 36.774770 |
| 122 | EUR | Spain | 1090000 | 7382779 | 14.764088 | 51.538858 |
| 106 | EUR | Italy | 991000 | 7382779 | 13.423130 | 64.961988 |
| 116 | EUR | Portugal | 344000 | 7382779 | 4.659492 | 69.621480 |
| 101 | EUR | Germany | 323000 | 7382779 | 4.375046 | 73.996526 |
| 134 | SEAR | Thailand | 5430000 | 13804019 | 39.336370 | 39.336370 |
| 130 | SEAR | Indonesia | 3513000 | 13804019 | 25.449110 | 64.785480 |
| 146 | WPR | Viet Nam | 1750000 | 4453955 | 39.290922 | 39.290922 |
| 144 | WPR | Philippines | 686400 | 4453955 | 15.411022 | 54.701945 |
| 137 | WPR | Cambodia | 639000 | 4453955 | 14.346800 | 69.048744 |
# 9. Merge ParentLocation (full name) into top_countries_per_region
# get ParentLocation mappings from hiv_df
region_names = hiv_df[['ParentLocationCode', 'ParentLocation']].drop_duplicates()
# Merge region names into your top_countries_per_region
top_countries_per_region = top_countries_per_region.merge(region_names, on='ParentLocationCode', how='left')
# 10. Now filter your original merged_df for only the top countries
top_country_trend = merged_df[
merged_df['Location'].isin(top_countries_per_region['Location'])
]
# Also merge to get ParentLocation for plotting
top_country_trend = top_country_trend.merge(region_names, on='ParentLocationCode', how='left')
# 9. Plot the trend
fig = px.line(
top_country_trend,
x='Period',
y='Value',
color='Location',
facet_col='ParentLocation', # <--- This will separate by WHO region nicely
facet_col_wrap=2, # Wrap facets into multiple rows if too many regions
title='Trend of HIV Cases in Top Contributing Countries (75% Global Burden per Region)',
labels={
'Value': 'Estimated HIV Cases',
'Period': 'Year',
'Location': 'Country'
}
)
fig.update_layout(height=800)
fig.show()
Merge the World Bank data on multidimensional poverty headcount ratio with the HIV data above and analyze the relationship between people living with HIV and multidimensional poverty, and the individual factors that contribute to the ratio. Remember to account for the random effects (country, year).¶
# Load the poverty data
poverty_df = pd.read_excel('multidimensional_poverty.xlsx')
poverty_df
| Region | Country code | Economy | Reporting year | Survey name | Survey year | Survey coverage | Welfare type | Survey comparability | Monetary (%) | Educational attainment (%) | Educational enrollment (%) | Electricity (%) | Sanitation (%) | Drinking water (%) | Multidimensional poverty headcount ratio (%) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | SSA | AGO | Angola | 2018 | IDREA | 2018 | N | c | 2 | 31.122005 | 29.753423 | 27.44306 | 52.639532 | 53.637516 | 32.106507 | 47.203606 |
| 1 | ECA | ALB | Albania | 2012 | HBS | 2018 | N | c | 1 | 0.048107 | 0.19238 | - | 0.06025 | 6.579772 | 9.594966 | 0.293161 |
| 2 | LAC | ARG | Argentina | 2010 | EPHC-S2 | 2021 | U | i | 3 | 0.894218 | 1.08532 | 0.731351 | 0 | 0.257453 | 0.364048 | 0.906573 |
| 3 | ECA | ARM | Armenia | 2010 | ILCS | 2021 | N | c | 1 | 0.523521 | 0 | 1.793004 | 0 | 0.397725 | 0.660082 | 0.523521 |
| 4 | EAP | AUS | Australia | 2010 | SIH-LIS | 2018 | N | I | 3 | 0.516880 | 1.71188 | - | 0 | 0 | - | 2.215770 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 105 | ECA | UZB | Uzbekistan | 2022 | HBS | 2022 | N | c | 1 | 2.253092 | 0 | - | 0.12747 | 21.786885 | 10.693686 | 2.253092 |
| 106 | EAP | VNM | Viet Nam | 2010 | VHLSS | 2022 | N | c | 2 | 0.963795 | 3.384816 | 1.841407 | 0.079733 | 4.132901 | 1.968127 | 1.266184 |
| 107 | EAP | VUT | Vanuatu | 2010 | NSDP | 2019 | N | c | 0 | 9.963333 | 25.723079 | 13.404277 | 26.994166 | 42.970088 | 11.813611 | 19.892171 |
| 108 | SSA | ZMB | Zambia | 2010 | LCMS-VIII | 2022 | N | c | 4 | 64.341974 | 16.267821 | 23.39835 | 45.135146 | 53.505135 | 26.849246 | 66.506058 |
| 109 | SSA | ZWE | Zimbabwe | 2017 | PICES | 2019 | N | c | 0 | 39.754534 | 0.927006 | 5.984225 | 37.994787 | 38.285229 | 19.260145 | 42.397931 |
110 rows × 16 columns
Attributes¶
Region: Geographic region of the country (e.g., Africa, Americas, Europe).
Country Code: Standardized 3-letter country code (e.g., AGO for Angola, BRA for Brazil).
Economy: Full name of the country (e.g. Angola).
Reporting Year: Year the data was officially reported (e.g., 2021, 2022)
Survey Name: Name of the survey or data source (e.g., "ENIGHNS" for Mexico, "EU-SILC" for European countries). Identifies the methodology or institution behind the data.
Survey Year: Year the survey was conducted.
Survey Coverage: Scope of the survey (e.g., national, subnational).
Welfare Type: Classification of welfare metrics (e.g., "c" for consumption-based, "i" for income-based).
Survey Comparability:Flags potential issues in comparing data across surveys.
Monetary (%):Percentage of the population below the monetary poverty line.
Educational Attainment (%): Percentage lacking minimum educational attainment (e.g., literacy, years of schooling).
Educational Enrollment (%): Percentage of children not enrolled in school
Electricity (%): Percentage without access to electricity.
Sanitation (%): Percentage without improved sanitation facilities.
Drinking Water (%): Percentage without access to clean drinking water.
Multidimensional Poverty Headcount Ratio (%): Percentage of the population experiencing multidimensional poverty (combined deprivations in health, education, and living standards).
Cleaning the Poverty dataset¶
poverty_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 110 entries, 0 to 109 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Region 110 non-null object 1 Country code 110 non-null object 2 Economy 110 non-null object 3 Reporting year 110 non-null int64 4 Survey name 110 non-null object 5 Survey year 110 non-null int64 6 Survey coverage 110 non-null object 7 Welfare type 110 non-null object 8 Survey comparability 110 non-null int64 9 Monetary (%) 110 non-null float64 10 Educational attainment (%) 110 non-null object 11 Educational enrollment (%) 110 non-null object 12 Electricity (%) 110 non-null object 13 Sanitation (%) 110 non-null object 14 Drinking water (%) 110 non-null object 15 Multidimensional poverty headcount ratio (%) 110 non-null float64 dtypes: float64(2), int64(3), object(11) memory usage: 13.9+ KB
floats = ['Educational attainment (%)', 'Educational enrollment (%)', 'Electricity (%)', 'Sanitation (%)', 'Drinking water (%)', 'Multidimensional poverty headcount ratio (%)']
for n in floats:
poverty_df[n] = poverty_df[n].replace('-', np.nan)
poverty_df[n] = poverty_df[n].astype(float)
poverty_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 110 entries, 0 to 109 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Region 110 non-null object 1 Country code 110 non-null object 2 Economy 110 non-null object 3 Reporting year 110 non-null int64 4 Survey name 110 non-null object 5 Survey year 110 non-null int64 6 Survey coverage 110 non-null object 7 Welfare type 110 non-null object 8 Survey comparability 110 non-null int64 9 Monetary (%) 110 non-null float64 10 Educational attainment (%) 109 non-null float64 11 Educational enrollment (%) 74 non-null float64 12 Electricity (%) 109 non-null float64 13 Sanitation (%) 85 non-null float64 14 Drinking water (%) 101 non-null float64 15 Multidimensional poverty headcount ratio (%) 110 non-null float64 dtypes: float64(7), int64(3), object(6) memory usage: 13.9+ KB
C:\Users\deninjo\AppData\Local\Temp\ipykernel_18780\3493648953.py:4: FutureWarning:
Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
C:\Users\deninjo\AppData\Local\Temp\ipykernel_18780\3493648953.py:4: FutureWarning:
Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
C:\Users\deninjo\AppData\Local\Temp\ipykernel_18780\3493648953.py:4: FutureWarning:
Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
C:\Users\deninjo\AppData\Local\Temp\ipykernel_18780\3493648953.py:4: FutureWarning:
Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
C:\Users\deninjo\AppData\Local\Temp\ipykernel_18780\3493648953.py:4: FutureWarning:
Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
# duplicates
poverty_df.duplicated().sum()
0
# nulls
poverty_df.isna().sum()
Region 0 Country code 0 Economy 0 Reporting year 0 Survey name 0 Survey year 0 Survey coverage 0 Welfare type 0 Survey comparability 0 Monetary (%) 0 Educational attainment (%) 1 Educational enrollment (%) 36 Electricity (%) 1 Sanitation (%) 25 Drinking water (%) 9 Multidimensional poverty headcount ratio (%) 0 dtype: int64
Due to our small dataset, we will replace the nulls rather than deleting them
# plotting boxplots for null columns to check distribution
import plotly.graph_objects as go
from plotly.subplots import make_subplots
nulls = ['Educational attainment (%)', 'Educational enrollment (%)',
'Electricity (%)', 'Sanitation (%)', 'Drinking water (%)']
# Create a subplot grid with 3 columns and 2 rows (5 variables in a grid)
fig = make_subplots(
rows=2, cols=3, # 2 rows, 3 columns (last position will be empty)
subplot_titles=nulls, # Titles for each subplot
vertical_spacing=0.15, # Reduce space between rows
horizontal_spacing=0.1 # Reduce space between columns
)
# Add a box plot for each column in the 'nulls' list
positions = [(1,1), (1,2), (1,3), (2,1), (2,2)] # Positions in the grid
for (row, col), var in zip(positions, nulls):
fig.add_trace(
go.Box(
y=poverty_df[var],
name=var,
boxmean=True, # Shows the mean line in the box plot
showlegend=False # Hide legend for individual plots
),
row=row, col=col
)
# Hide the empty subplot (2,3)
fig.update_layout(
height=600, # Smaller height for more compact view
width=900, # Adjust width to maintain proportions
title_text="Box Plots for Various Poverty Factors",
margin=dict(t=50, b=20, l=20, r=20), # Reduce margins
# Adjust font sizes for compact display
font=dict(size=10),
title_font=dict(size=12),
# Make the boxes more compact
boxmode='group',
boxgap=0.3,
boxgroupgap=0.3
)
# Update subplot titles to be smaller
for annotation in fig['layout']['annotations']:
annotation['font'] = dict(size=10)
# Show the plot
fig.show()
The distribution has outliers hence it would not be appropriate to replace with mean but rather the median
# Replace null values with median for each column
for col in nulls:
median_value = poverty_df[col].median()
poverty_df[col] = poverty_df[col].fillna(median_value)
poverty_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 110 entries, 0 to 109 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Region 110 non-null object 1 Country code 110 non-null object 2 Economy 110 non-null object 3 Reporting year 110 non-null int64 4 Survey name 110 non-null object 5 Survey year 110 non-null int64 6 Survey coverage 110 non-null object 7 Welfare type 110 non-null object 8 Survey comparability 110 non-null int64 9 Monetary (%) 110 non-null float64 10 Educational attainment (%) 110 non-null float64 11 Educational enrollment (%) 110 non-null float64 12 Electricity (%) 110 non-null float64 13 Sanitation (%) 110 non-null float64 14 Drinking water (%) 110 non-null float64 15 Multidimensional poverty headcount ratio (%) 110 non-null float64 dtypes: float64(7), int64(3), object(6) memory usage: 13.9+ KB
Comparing with the HIV datset¶
poverty_df.head()
| Region | Country code | Economy | Reporting year | Survey name | Survey year | Survey coverage | Welfare type | Survey comparability | Monetary (%) | Educational attainment (%) | Educational enrollment (%) | Electricity (%) | Sanitation (%) | Drinking water (%) | Multidimensional poverty headcount ratio (%) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | SSA | AGO | Angola | 2018 | IDREA | 2018 | N | c | 2 | 31.122005 | 29.753423 | 27.443060 | 52.639532 | 53.637516 | 32.106507 | 47.203606 |
| 1 | ECA | ALB | Albania | 2012 | HBS | 2018 | N | c | 1 | 0.048107 | 0.192380 | 3.045111 | 0.060250 | 6.579772 | 9.594966 | 0.293161 |
| 2 | LAC | ARG | Argentina | 2010 | EPHC-S2 | 2021 | U | i | 3 | 0.894218 | 1.085320 | 0.731351 | 0.000000 | 0.257453 | 0.364048 | 0.906573 |
| 3 | ECA | ARM | Armenia | 2010 | ILCS | 2021 | N | c | 1 | 0.523521 | 0.000000 | 1.793004 | 0.000000 | 0.397725 | 0.660082 | 0.523521 |
| 4 | EAP | AUS | Australia | 2010 | SIH-LIS | 2018 | N | I | 3 | 0.516880 | 1.711880 | 3.045111 | 0.000000 | 0.000000 | 2.421072 | 2.215770 |
# Load the hiv data
hiv_df.tail()
| ParentLocationCode | ParentLocation | SpatialDimValueCode | Location | Period | Value | |
|---|---|---|---|---|---|---|
| 1531 | WPR | Western Pacific | VNM | Viet Nam | 2020 | 250000 |
| 1532 | WPR | Western Pacific | VNM | Viet Nam | 2015 | 240000 |
| 1533 | WPR | Western Pacific | VNM | Viet Nam | 2010 | 210000 |
| 1534 | WPR | Western Pacific | VNM | Viet Nam | 2005 | 180000 |
| 1535 | WPR | Western Pacific | VNM | Viet Nam | 2000 | 120000 |
Comparison
- In the HIV dataset,
SpatialDimValueCodecorresponds toCountry codein the poverty dataset - In the HIV dataset,
Locationcorresponds toEconomyin the poverty dataset - In the HIV dataset,
Periodcorresponds toReporting yearin the poverty dataset, the actual year in which data was recorded
# Merge the datasets on the common columns
merged_df = pd.merge(
hiv_df,
poverty_df,
left_on=['SpatialDimValueCode', 'Location', 'Period'],
right_on=['Country code', 'Economy', 'Reporting year'],
how='inner' # Use 'inner' for intersection of both datasets; change to 'outer' if you want to keep all rows
)
merged_df
| ParentLocationCode | ParentLocation | SpatialDimValueCode | Location | Period | Value | Region | Country code | Economy | Reporting year | ... | Survey coverage | Welfare type | Survey comparability | Monetary (%) | Educational attainment (%) | Educational enrollment (%) | Electricity (%) | Sanitation (%) | Drinking water (%) | Multidimensional poverty headcount ratio (%) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFR | Africa | BEN | Benin | 2015 | 71000 | SSA | BEN | Benin | 2015 | ... | N | c | 1 | 12.723279 | 49.023893 | 31.661823 | 34.818304 | 76.649594 | 24.109130 | 45.443240 |
| 1 | AFR | Africa | CIV | Cote d'Ivoire | 2015 | 460000 | SSA | CIV | Cote d'Ivoire | 2015 | ... | N | c | 2 | 9.733193 | 44.493997 | 24.717054 | 9.264950 | 59.909374 | 17.286547 | 29.177094 |
| 2 | AFR | Africa | GNB | Guinea-Bissau | 2010 | 37000 | SSA | GNB | Guinea-Bissau | 2010 | ... | N | c | 3 | 25.962856 | 20.071073 | 31.059781 | 27.570510 | 60.362864 | 20.897064 | 38.730049 |
| 3 | AFR | Africa | KEN | Kenya | 2015 | 1500000 | SSA | KEN | Kenya | 2015 | ... | N | c | 3 | 36.146057 | 10.055726 | 1.179968 | 24.500306 | 22.312516 | 36.060327 | 38.490102 |
| 4 | AFR | Africa | MWI | Malawi | 2010 | 930000 | SSA | MWI | Malawi | 2010 | ... | N | c | 1 | 70.060599 | 54.329389 | 3.686354 | 88.757396 | 75.137657 | 11.400399 | 78.252000 |
| 5 | AFR | Africa | ZMB | Zambia | 2010 | 940000 | SSA | ZMB | Zambia | 2010 | ... | N | c | 4 | 64.341974 | 16.267821 | 23.398350 | 45.135146 | 53.505135 | 26.849246 | 66.506058 |
| 6 | AMR | Americas | ARG | Argentina | 2010 | 92000 | LAC | ARG | Argentina | 2010 | ... | U | i | 3 | 0.894218 | 1.085320 | 0.731351 | 0.000000 | 0.257453 | 0.364048 | 0.906573 |
| 7 | AMR | Americas | COL | Colombia | 2010 | 140000 | LAC | COL | Colombia | 2010 | ... | N | i | 4 | 7.334666 | 5.054273 | 2.794198 | 1.110965 | 7.673390 | 1.721011 | 7.698172 |
| 8 | AMR | Americas | CRI | Costa Rica | 2010 | 9200 | LAC | CRI | Costa Rica | 2010 | ... | N | i | 3 | 1.242194 | 3.650702 | 0.452292 | 0.261438 | 1.557860 | 0.252125 | 1.320344 |
| 9 | AMR | Americas | DOM | Dominican Republic | 2010 | 74000 | LAC | DOM | Dominican Republic | 2010 | ... | N | i | 5 | 0.853729 | 12.177467 | 5.423051 | 0.474999 | 5.528953 | 4.678269 | 1.813273 |
| 10 | AMR | Americas | ECU | Ecuador | 2010 | 31000 | LAC | ECU | Ecuador | 2010 | ... | N | i | 7 | 3.583180 | 2.962833 | 2.339824 | 1.611026 | 4.997954 | 3.688032 | 4.252161 |
| 11 | AMR | Americas | HND | Honduras | 2010 | 25000 | LAC | HND | Honduras | 2010 | ... | N | i | 4 | 12.647659 | 10.125883 | 9.975495 | 6.744803 | 5.810140 | 5.680016 | 14.782456 |
| 12 | AMR | Americas | MEX | Mexico | 2010 | 210000 | LAC | MEX | Mexico | 2010 | ... | N | i | 0 | 1.179679 | 3.724001 | 2.672050 | 0.320671 | 10.676579 | 4.353976 | 1.740318 |
| 13 | AMR | Americas | PAN | Panama | 2010 | 19000 | LAC | PAN | Panama | 2010 | ... | N | i | 5 | 1.108257 | 2.424547 | 1.619710 | 4.764825 | 6.176661 | 4.484637 | 2.445591 |
| 14 | AMR | Americas | PER | Peru | 2010 | 61000 | LAC | PER | Peru | 2010 | ... | N | i | 3 | 2.773094 | 5.365242 | 1.210119 | 4.144808 | 12.159352 | 5.256491 | 4.120150 |
| 15 | AMR | Americas | PRY | Paraguay | 2010 | 11000 | LAC | PRY | Paraguay | 2010 | ... | N | i | 3 | 0.689179 | 4.447169 | 1.995930 | 0.134809 | 10.389888 | 1.107492 | 1.321316 |
| 16 | AMR | Americas | SLV | El Salvador | 2010 | 21000 | LAC | SLV | El Salvador | 2010 | ... | N | i | 1 | 3.586186 | 25.155506 | 4.172391 | 1.674593 | 9.223166 | 2.952473 | 6.253187 |
| 17 | AMR | Americas | SUR | Suriname | 2022 | 7200 | LAC | SUR | Suriname | 2022 | ... | N | c | 1 | 1.104312 | 7.032160 | 0.874065 | 0.315518 | 5.599063 | 2.421072 | 2.129987 |
| 18 | AMR | Americas | URY | Uruguay | 2010 | 9000 | LAC | URY | Uruguay | 2010 | ... | N | i | 3 | 0.112155 | 1.855258 | 0.482043 | 0.028709 | 1.953975 | 0.593860 | 0.148147 |
| 19 | EMR | Eastern Mediterranean | PAK | Pakistan | 2010 | 79000 | SAR | PAK | Pakistan | 2010 | ... | N | c | 1 | 4.932448 | 21.092634 | 28.823763 | 9.321731 | 24.799852 | 6.517883 | 16.667987 |
| 20 | EMR | Eastern Mediterranean | TUN | Tunisia | 2010 | 4400 | MNA | TUN | Tunisia | 2010 | ... | N | c | 2 | 0.254945 | 6.501850 | 1.307041 | 0.188030 | 0.613286 | 1.606651 | 0.411305 |
| 21 | EUR | Europe | ARM | Armenia | 2010 | 1800 | ECA | ARM | Armenia | 2010 | ... | N | c | 1 | 0.523521 | 0.000000 | 1.793004 | 0.000000 | 0.397725 | 0.660082 | 0.523521 |
| 22 | EUR | Europe | BLR | Belarus | 2010 | 16000 | ECA | BLR | Belarus | 2010 | ... | N | c | 1 | 0.000000 | 0.000000 | 3.045111 | 0.134809 | 4.596488 | 3.342947 | 3.157514 |
| 23 | EUR | Europe | DEU | Germany | 2010 | 65000 | ECA | DEU | Germany | 2010 | ... | N | I | 2 | 0.208680 | 2.113210 | 2.596350 | 0.000000 | 0.000000 | 2.421072 | 0.321820 |
| 24 | EUR | Europe | GEO | Georgia | 2010 | 2600 | ECA | GEO | Georgia | 2010 | ... | N | c | 2 | 5.485026 | 0.024344 | 1.142132 | 0.000000 | 8.917280 | 5.356134 | 5.493653 |
| 25 | EUR | Europe | HRV | Croatia | 2010 | 1200 | ECA | HRV | Croatia | 2010 | ... | N | i | 3 | 0.308004 | 0.145387 | 3.045111 | 0.000000 | 9.223166 | 0.000000 | 0.453452 |
| 26 | EUR | Europe | ISR | Israel | 2010 | 5900 | MNA | ISR | Israel | 2010 | ... | N | I | 0 | 0.347770 | 0.560530 | 0.536060 | 0.000000 | 0.000000 | 2.421072 | 0.347770 |
| 27 | EUR | Europe | KAZ | Kazakhstan | 2010 | 17000 | ECA | KAZ | Kazakhstan | 2010 | ... | N | c | 3 | 0.015141 | 0.003963 | 3.045111 | 0.000000 | 0.543433 | 0.724184 | 0.019104 |
| 28 | EUR | Europe | MKD | North Macedonia | 2010 | 199 | ECA | MKD | North Macedonia | 2010 | ... | N | i | 2 | 2.663351 | 0.410918 | 3.045111 | 0.000000 | 4.538547 | 2.421072 | 3.060215 |
| 29 | EUR | Europe | ROU | Romania | 2010 | 14000 | ECA | ROU | Romania | 2010 | ... | N | c | 2 | 0.000000 | 0.086624 | 1.302026 | 0.002732 | 14.135401 | 1.309375 | 0.012860 |
| 30 | EUR | Europe | SRB | Serbia | 2010 | 2000 | ECA | SRB | Serbia | 2010 | ... | N | c | 1 | 0.045232 | 1.718018 | 0.670786 | 0.142480 | 1.480976 | 0.091316 | 0.233548 |
| 31 | EUR | Europe | UKR | Ukraine | 2010 | 250000 | ECA | UKR | Ukraine | 2010 | ... | N | c | 4 | 0.028826 | 1.645547 | 3.045111 | 0.000000 | 12.413940 | 0.000000 | 1.674372 |
| 32 | EUR | Europe | UZB | Uzbekistan | 2022 | 59000 | ECA | UZB | Uzbekistan | 2022 | ... | N | c | 1 | 2.253092 | 0.000000 | 3.045111 | 0.127470 | 21.786885 | 10.693686 | 2.253092 |
| 33 | SEAR | South-East Asia | BGD | Bangladesh | 2010 | 7200 | SAR | BGD | Bangladesh | 2010 | ... | N | c | 2 | 5.008757 | 9.497152 | 6.686347 | 2.374333 | 27.518305 | 1.998808 | 6.607784 |
| 34 | SEAR | South-East Asia | NPL | Nepal | 2010 | 34000 | SAR | NPL | Nepal | 2010 | ... | N | c | 2 | 0.366599 | 13.394071 | 3.296023 | 5.949776 | 5.567963 | 2.919005 | 2.171141 |
| 35 | WPR | Western Pacific | AUS | Australia | 2010 | 20000 | EAP | AUS | Australia | 2010 | ... | N | I | 3 | 0.516880 | 1.711880 | 3.045111 | 0.000000 | 0.000000 | 2.421072 | 2.215770 |
| 36 | WPR | Western Pacific | MNG | Mongolia | 2010 | 499 | EAP | MNG | Mongolia | 2010 | ... | N | c | 3 | 0.223410 | 1.140594 | 4.088683 | 0.070097 | 43.712702 | 8.935307 | 1.123338 |
| 37 | WPR | Western Pacific | VNM | Viet Nam | 2010 | 210000 | EAP | VNM | Viet Nam | 2010 | ... | N | c | 2 | 0.963795 | 3.384816 | 1.841407 | 0.079733 | 4.132901 | 1.968127 | 1.266184 |
38 rows × 22 columns
# remove repetive columns
x = ['Region', 'Country code', 'Economy', 'Period']
merged_df = merged_df.drop(columns=x)
merged_df = merged_df.rename(columns={'SpatialDimValueCode': 'Country code'})
merged_df
| ParentLocationCode | ParentLocation | Country code | Location | Value | Reporting year | Survey name | Survey year | Survey coverage | Welfare type | Survey comparability | Monetary (%) | Educational attainment (%) | Educational enrollment (%) | Electricity (%) | Sanitation (%) | Drinking water (%) | Multidimensional poverty headcount ratio (%) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFR | Africa | BEN | Benin | 71000 | 2015 | EHCVM | 2021 | N | c | 1 | 12.723279 | 49.023893 | 31.661823 | 34.818304 | 76.649594 | 24.109130 | 45.443240 |
| 1 | AFR | Africa | CIV | Cote d'Ivoire | 460000 | 2015 | EHCVM | 2021 | N | c | 2 | 9.733193 | 44.493997 | 24.717054 | 9.264950 | 59.909374 | 17.286547 | 29.177094 |
| 2 | AFR | Africa | GNB | Guinea-Bissau | 37000 | 2010 | EHCVM | 2021 | N | c | 3 | 25.962856 | 20.071073 | 31.059781 | 27.570510 | 60.362864 | 20.897064 | 38.730049 |
| 3 | AFR | Africa | KEN | Kenya | 1500000 | 2015 | KCHS | 2021 | N | c | 3 | 36.146057 | 10.055726 | 1.179968 | 24.500306 | 22.312516 | 36.060327 | 38.490102 |
| 4 | AFR | Africa | MWI | Malawi | 930000 | 2010 | IHS-V | 2019 | N | c | 1 | 70.060599 | 54.329389 | 3.686354 | 88.757396 | 75.137657 | 11.400399 | 78.252000 |
| 5 | AFR | Africa | ZMB | Zambia | 940000 | 2010 | LCMS-VIII | 2022 | N | c | 4 | 64.341974 | 16.267821 | 23.398350 | 45.135146 | 53.505135 | 26.849246 | 66.506058 |
| 6 | AMR | Americas | ARG | Argentina | 92000 | 2010 | EPHC-S2 | 2021 | U | i | 3 | 0.894218 | 1.085320 | 0.731351 | 0.000000 | 0.257453 | 0.364048 | 0.906573 |
| 7 | AMR | Americas | COL | Colombia | 140000 | 2010 | GEIH | 2021 | N | i | 4 | 7.334666 | 5.054273 | 2.794198 | 1.110965 | 7.673390 | 1.721011 | 7.698172 |
| 8 | AMR | Americas | CRI | Costa Rica | 9200 | 2010 | ENAHO | 2021 | N | i | 3 | 1.242194 | 3.650702 | 0.452292 | 0.261438 | 1.557860 | 0.252125 | 1.320344 |
| 9 | AMR | Americas | DOM | Dominican Republic | 74000 | 2010 | ECNFT-Q03 | 2021 | N | i | 5 | 0.853729 | 12.177467 | 5.423051 | 0.474999 | 5.528953 | 4.678269 | 1.813273 |
| 10 | AMR | Americas | ECU | Ecuador | 31000 | 2010 | ENEMDU | 2021 | N | i | 7 | 3.583180 | 2.962833 | 2.339824 | 1.611026 | 4.997954 | 3.688032 | 4.252161 |
| 11 | AMR | Americas | HND | Honduras | 25000 | 2010 | EPHPM | 2019 | N | i | 4 | 12.647659 | 10.125883 | 9.975495 | 6.744803 | 5.810140 | 5.680016 | 14.782456 |
| 12 | AMR | Americas | MEX | Mexico | 210000 | 2010 | ENIGHNS | 2022 | N | i | 0 | 1.179679 | 3.724001 | 2.672050 | 0.320671 | 10.676579 | 4.353976 | 1.740318 |
| 13 | AMR | Americas | PAN | Panama | 19000 | 2010 | EH | 2021 | N | i | 5 | 1.108257 | 2.424547 | 1.619710 | 4.764825 | 6.176661 | 4.484637 | 2.445591 |
| 14 | AMR | Americas | PER | Peru | 61000 | 2010 | ENAHO | 2021 | N | i | 3 | 2.773094 | 5.365242 | 1.210119 | 4.144808 | 12.159352 | 5.256491 | 4.120150 |
| 15 | AMR | Americas | PRY | Paraguay | 11000 | 2010 | EPH | 2021 | N | i | 3 | 0.689179 | 4.447169 | 1.995930 | 0.134809 | 10.389888 | 1.107492 | 1.321316 |
| 16 | AMR | Americas | SLV | El Salvador | 21000 | 2010 | EHPM | 2021 | N | i | 1 | 3.586186 | 25.155506 | 4.172391 | 1.674593 | 9.223166 | 2.952473 | 6.253187 |
| 17 | AMR | Americas | SUR | Suriname | 7200 | 2022 | SSLC | 2022 | N | c | 1 | 1.104312 | 7.032160 | 0.874065 | 0.315518 | 5.599063 | 2.421072 | 2.129987 |
| 18 | AMR | Americas | URY | Uruguay | 9000 | 2010 | ECH-S2 | 2021 | N | i | 3 | 0.112155 | 1.855258 | 0.482043 | 0.028709 | 1.953975 | 0.593860 | 0.148147 |
| 19 | EMR | Eastern Mediterranean | PAK | Pakistan | 79000 | 2010 | HIES | 2018 | N | c | 1 | 4.932448 | 21.092634 | 28.823763 | 9.321731 | 24.799852 | 6.517883 | 16.667987 |
| 20 | EMR | Eastern Mediterranean | TUN | Tunisia | 4400 | 2010 | NSHBCSL | 2021 | N | c | 2 | 0.254945 | 6.501850 | 1.307041 | 0.188030 | 0.613286 | 1.606651 | 0.411305 |
| 21 | EUR | Europe | ARM | Armenia | 1800 | 2010 | ILCS | 2021 | N | c | 1 | 0.523521 | 0.000000 | 1.793004 | 0.000000 | 0.397725 | 0.660082 | 0.523521 |
| 22 | EUR | Europe | BLR | Belarus | 16000 | 2010 | HHS | 2019 | N | c | 1 | 0.000000 | 0.000000 | 3.045111 | 0.134809 | 4.596488 | 3.342947 | 3.157514 |
| 23 | EUR | Europe | DEU | Germany | 65000 | 2010 | GSOEP-LIS | 2020 | N | I | 2 | 0.208680 | 2.113210 | 2.596350 | 0.000000 | 0.000000 | 2.421072 | 0.321820 |
| 24 | EUR | Europe | GEO | Georgia | 2600 | 2010 | HIS | 2021 | N | c | 2 | 5.485026 | 0.024344 | 1.142132 | 0.000000 | 8.917280 | 5.356134 | 5.493653 |
| 25 | EUR | Europe | HRV | Croatia | 1200 | 2010 | EU-SILC | 2022 | N | i | 3 | 0.308004 | 0.145387 | 3.045111 | 0.000000 | 9.223166 | 0.000000 | 0.453452 |
| 26 | EUR | Europe | ISR | Israel | 5900 | 2010 | HES-LIS | 2021 | N | I | 0 | 0.347770 | 0.560530 | 0.536060 | 0.000000 | 0.000000 | 2.421072 | 0.347770 |
| 27 | EUR | Europe | KAZ | Kazakhstan | 17000 | 2010 | HBS | 2018 | N | c | 3 | 0.015141 | 0.003963 | 3.045111 | 0.000000 | 0.543433 | 0.724184 | 0.019104 |
| 28 | EUR | Europe | MKD | North Macedonia | 199 | 2010 | SILC-C | 2020 | N | i | 2 | 2.663351 | 0.410918 | 3.045111 | 0.000000 | 4.538547 | 2.421072 | 3.060215 |
| 29 | EUR | Europe | ROU | Romania | 14000 | 2010 | HBS | 2021 | N | c | 2 | 0.000000 | 0.086624 | 1.302026 | 0.002732 | 14.135401 | 1.309375 | 0.012860 |
| 30 | EUR | Europe | SRB | Serbia | 2000 | 2010 | HBS | 2019 | N | c | 1 | 0.045232 | 1.718018 | 0.670786 | 0.142480 | 1.480976 | 0.091316 | 0.233548 |
| 31 | EUR | Europe | UKR | Ukraine | 250000 | 2010 | HLCS | 2020 | N | c | 4 | 0.028826 | 1.645547 | 3.045111 | 0.000000 | 12.413940 | 0.000000 | 1.674372 |
| 32 | EUR | Europe | UZB | Uzbekistan | 59000 | 2022 | HBS | 2022 | N | c | 1 | 2.253092 | 0.000000 | 3.045111 | 0.127470 | 21.786885 | 10.693686 | 2.253092 |
| 33 | SEAR | South-East Asia | BGD | Bangladesh | 7200 | 2010 | HIES | 2022 | N | c | 2 | 5.008757 | 9.497152 | 6.686347 | 2.374333 | 27.518305 | 1.998808 | 6.607784 |
| 34 | SEAR | South-East Asia | NPL | Nepal | 34000 | 2010 | LSS-IV | 2022 | N | c | 2 | 0.366599 | 13.394071 | 3.296023 | 5.949776 | 5.567963 | 2.919005 | 2.171141 |
| 35 | WPR | Western Pacific | AUS | Australia | 20000 | 2010 | SIH-LIS | 2018 | N | I | 3 | 0.516880 | 1.711880 | 3.045111 | 0.000000 | 0.000000 | 2.421072 | 2.215770 |
| 36 | WPR | Western Pacific | MNG | Mongolia | 499 | 2010 | HSES | 2022 | N | c | 3 | 0.223410 | 1.140594 | 4.088683 | 0.070097 | 43.712702 | 8.935307 | 1.123338 |
| 37 | WPR | Western Pacific | VNM | Viet Nam | 210000 | 2010 | VHLSS | 2022 | N | c | 2 | 0.963795 | 3.384816 | 1.841407 | 0.079733 | 4.132901 | 1.968127 | 1.266184 |